/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

Porter stemmer in Java. The original paper is in

Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,

See also http://www.tartarus.org/~martin/PorterStemmer/index.html

Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then out outside the bounds of b.

Similarly,

Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.

Release 3.

[ This version is derived from Release 3, modified by Brian Goetz to
optimize for fewer object creations.  ]
*/

using System;

namespace Lucene.Net.Analysis
{
	
    /// <summary> 
    /// Stemmer, implementing the Porter Stemming Algorithm
    /// 
    /// The Stemmer class transforms a word into its root form.  The input
    /// word can be provided a character at time (by calling add()), or at once
    /// by calling one of the various stem(something) methods.
    /// </summary>
	
    class PorterStemmer
    {
        private char[] b;
        private int i, j, k, k0;
        private bool dirty = false;
        private const int INC = 50; /* unit of size whereby b is increased */
        private const int EXTRA = 1;
		
        public PorterStemmer()
        {
            b = new char[INC];
            i = 0;
        }
		
        /// <summary> reset() resets the stemmer so it can stem another word.  If you invoke
        /// the stemmer by calling add(char) and then Stem(), you must call reset()
        /// before starting another word.
        /// </summary>
        public virtual void  Reset()
        {
            i = 0; dirty = false;
        }
		
        /// <summary> Add a character to the word being stemmed.  When you are finished
        /// adding characters, you can call Stem(void) to process the word.
        /// </summary>
        public virtual void  Add(char ch)
        {
            if (b.Length <= i + EXTRA)
            {
                char[] new_b = new char[b.Length + INC];
                Array.Copy(b, 0, new_b, 0, b.Length);
                b = new_b;
            }
            b[i++] = ch;
        }
		
        /// <summary> After a word has been stemmed, it can be retrieved by toString(),
        /// or a reference to the internal buffer can be retrieved by getResultBuffer
        /// and getResultLength (which is generally more efficient.)
        /// </summary>
        public override System.String ToString()
        {
            return new System.String(b, 0, i);
        }
		
        /// <summary> Returns the length of the word resulting from the stemming process.</summary>
        public virtual int GetResultLength()
        {
            return i;
        }
		
        /// <summary> Returns a reference to a character buffer containing the results of
        /// the stemming process.  You also need to consult getResultLength()
        /// to determine the length of the result.
        /// </summary>
        public virtual char[] GetResultBuffer()
        {
            return b;
        }
		
        /* cons(i) is true <=> b[i] is a consonant. */
		
        private bool Cons(int i)
        {
            switch (b[i])
            {
				
                case 'a': 
                case 'e': 
                case 'i': 
                case 'o': 
                case 'u': 
                    return false;
				
                case 'y': 
                    return (i == k0) ? true : !Cons(i - 1);
				
                default: 
                    return true;
				
            }
        }
		
        /* m() measures the number of consonant sequences between k0 and j. if c is
        a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
        presence,
		
        <c><v>       gives 0
        <c>vc<v>     gives 1
        <c>vcvc<v>   gives 2
        <c>vcvcvc<v> gives 3
        ....
        */
		
        private int M()
        {
            int n = 0;
            int i = k0;
            while (true)
            {
                if (i > j)
                    return n;
                if (!Cons(i))
                    break;
                i++;
            }
            i++;
            while (true)
            {
                while (true)
                {
                    if (i > j)
                        return n;
                    if (Cons(i))
                        break;
                    i++;
                }
                i++;
                n++;
                while (true)
                {
                    if (i > j)
                        return n;
                    if (!Cons(i))
                        break;
                    i++;
                }
                i++;
            }
        }
		
        /* vowelinstem() is true <=> k0,...j contains a vowel */
		
        private bool Vowelinstem()
        {
            int i;
            for (i = k0; i <= j; i++)
                if (!Cons(i))
                    return true;
            return false;
        }
		
        /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
		
        private bool Doublec(int j)
        {
            if (j < k0 + 1)
                return false;
            if (b[j] != b[j - 1])
                return false;
            return Cons(j);
        }
		
        /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
        and also if the second c is not w,x or y. this is used when trying to
        restore an e at the end of a short word. e.g.
		
        cav(e), lov(e), hop(e), crim(e), but
        snow, box, tray.
		
        */
		
        private bool Cvc(int i)
        {
            if (i < k0 + 2 || !Cons(i) || Cons(i - 1) || !Cons(i - 2))
                return false;
            else
            {
                int ch = b[i];
                if (ch == 'w' || ch == 'x' || ch == 'y')
                    return false;
            }
            return true;
        }
		
        private bool Ends(System.String s)
        {
            int l = s.Length;
            int o = k - l + 1;
            if (o < k0)
                return false;
            for (int i = 0; i < l; i++)
                if (b[o + i] != s[i])
                    return false;
            j = k - l;
            return true;
        }
		
        /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
        k. */
		
        internal virtual void  Setto(System.String s)
        {
            int l = s.Length;
            int o = j + 1;
            for (int i = 0; i < l; i++)
                b[o + i] = s[i];
            k = j + l;
            dirty = true;
        }
		
        /* r(s) is used further down. */
		
        internal virtual void  R(System.String s)
        {
            if (M() > 0)
                Setto(s);
        }
		
        /* step1() gets rid of plurals and -ed or -ing. e.g.
		
        caresses  ->  caress
        ponies    ->  poni
        ties      ->  ti
        caress    ->  caress
        cats      ->  cat
		
        feed      ->  feed
        agreed    ->  agree
        disabled  ->  disable
		
        matting   ->  mat
        mating    ->  mate
        meeting   ->  meet
        milling   ->  mill
        messing   ->  mess
		
        meetings  ->  meet
		
        */
		
        private void  Step1()
        {
            if (b[k] == 's')
            {
                if (Ends("sses"))
                    k -= 2;
                else if (Ends("ies"))
                    Setto("i");
                else if (b[k - 1] != 's')
                    k--;
            }
            if (Ends("eed"))
            {
                if (M() > 0)
                    k--;
            }
            else if ((Ends("ed") || Ends("ing")) && Vowelinstem())
            {
                k = j;
                if (Ends("at"))
                    Setto("ate");
                else if (Ends("bl"))
                    Setto("ble");
                else if (Ends("iz"))
                    Setto("ize");
                else if (Doublec(k))
                {
                    int ch = b[k--];
                    if (ch == 'l' || ch == 's' || ch == 'z')
                        k++;
                }
                else if (M() == 1 && Cvc(k))
                    Setto("e");
            }
        }
		
        /* step2() turns terminal y to i when there is another vowel in the stem. */
		
        private void  Step2()
        {
            if (Ends("y") && Vowelinstem())
            {
                b[k] = 'i';
                dirty = true;
            }
        }
		
        /* step3() maps double suffices to single ones. so -ization ( = -ize plus
        -ation) maps to -ize etc. note that the string before the suffix must give
        m() > 0. */
		
        private void  Step3()
        {
            if (k == k0)
                return ; /* For Bug 1 */
            switch (b[k - 1])
            {
				
                case 'a': 
                    if (Ends("ational"))
                    {
                        R("ate"); break;
                    }
                    if (Ends("tional"))
                    {
                        R("tion"); break;
                    }
                    break;
				
                case 'c': 
                    if (Ends("enci"))
                    {
                        R("ence"); break;
                    }
                    if (Ends("anci"))
                    {
                        R("ance"); break;
                    }
                    break;
				
                case 'e': 
                    if (Ends("izer"))
                    {
                        R("ize"); break;
                    }
                    break;
				
                case 'l': 
                    if (Ends("bli"))
                    {
                        R("ble"); break;
                    }
                    if (Ends("alli"))
                    {
                        R("al"); break;
                    }
                    if (Ends("entli"))
                    {
                        R("ent"); break;
                    }
                    if (Ends("eli"))
                    {
                        R("e"); break;
                    }
                    if (Ends("ousli"))
                    {
                        R("ous"); break;
                    }
                    break;
				
                case 'o': 
                    if (Ends("ization"))
                    {
                        R("ize"); break;
                    }
                    if (Ends("ation"))
                    {
                        R("ate"); break;
                    }
                    if (Ends("ator"))
                    {
                        R("ate"); break;
                    }
                    break;
				
                case 's': 
                    if (Ends("alism"))
                    {
                        R("al"); break;
                    }
                    if (Ends("iveness"))
                    {
                        R("ive"); break;
                    }
                    if (Ends("fulness"))
                    {
                        R("ful"); break;
                    }
                    if (Ends("ousness"))
                    {
                        R("ous"); break;
                    }
                    break;
				
                case 't': 
                    if (Ends("aliti"))
                    {
                        R("al"); break;
                    }
                    if (Ends("iviti"))
                    {
                        R("ive"); break;
                    }
                    if (Ends("biliti"))
                    {
                        R("ble"); break;
                    }
                    break;
				
                case 'g': 
                    if (Ends("logi"))
                    {
                        R("log"); break;
                    }
                    break;
            }
        }
		
        /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
		
        private void  Step4()
        {
            switch (b[k])
            {
				
                case 'e': 
                    if (Ends("icate"))
                    {
                        R("ic"); break;
                    }
                    if (Ends("ative"))
                    {
                        R(""); break;
                    }
                    if (Ends("alize"))
                    {
                        R("al"); break;
                    }
                    break;
				
                case 'i': 
                    if (Ends("iciti"))
                    {
                        R("ic"); break;
                    }
                    break;
				
                case 'l': 
                    if (Ends("ical"))
                    {
                        R("ic"); break;
                    }
                    if (Ends("ful"))
                    {
                        R(""); break;
                    }
                    break;
				
                case 's': 
                    if (Ends("ness"))
                    {
                        R(""); break;
                    }
                    break;
            }
        }
		
        /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
		
        private void  Step5()
        {
            if (k == k0)
                return ; /* for Bug 1 */
            switch (b[k - 1])
            {
				
                case 'a': 
                    if (Ends("al"))
                        break;
                    return ;
				
                case 'c': 
                    if (Ends("ance"))
                        break;
                    if (Ends("ence"))
                        break;
                    return ;
				
                case 'e': 
                    if (Ends("er"))
                        break; return ;
				
                case 'i': 
                    if (Ends("ic"))
                        break; return ;
				
                case 'l': 
                    if (Ends("able"))
                        break;
                    if (Ends("ible"))
                        break; return ;
				
                case 'n': 
                    if (Ends("ant"))
                        break;
                    if (Ends("ement"))
                        break;
                    if (Ends("ment"))
                        break;
                    /* element etc. not stripped before the m */
                    if (Ends("ent"))
                        break;
                    return ;
				
                case 'o': 
                    if (Ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
                        break;
                    /* j >= 0 fixes Bug 2 */
                    if (Ends("ou"))
                        break;
                    return ;
                    /* takes care of -ous */
				
                case 's': 
                    if (Ends("ism"))
                        break;
                    return ;
				
                case 't': 
                    if (Ends("ate"))
                        break;
                    if (Ends("iti"))
                        break;
                    return ;
				
                case 'u': 
                    if (Ends("ous"))
                        break;
                    return ;
				
                case 'v': 
                    if (Ends("ive"))
                        break;
                    return ;
				
                case 'z': 
                    if (Ends("ize"))
                        break;
                    return ;
				
                default: 
                    return ;
				
            }
            if (M() > 1)
                k = j;
        }
		
        /* step6() removes a final -e if m() > 1. */
		
        private void  Step6()
        {
            j = k;
            if (b[k] == 'e')
            {
                int a = M();
                if (a > 1 || a == 1 && !Cvc(k - 1))
                    k--;
            }
            if (b[k] == 'l' && Doublec(k) && M() > 1)
                k--;
        }
		
		
        /// <summary> Stem a word provided as a String.  Returns the result as a String.</summary>
        public virtual System.String Stem(System.String s)
        {
            if (Stem(s.ToCharArray(), s.Length))
            {
                return ToString();
            }
            else
                return s;
        }
		
        /// <summary>Stem a word contained in a char[].  Returns true if the stemming process
        /// resulted in a word different from the input.  You can retrieve the
        /// result with getResultLength()/getResultBuffer() or toString().
        /// </summary>
        public virtual bool Stem(char[] word)
        {
            return Stem(word, word.Length);
        }
		
        /// <summary>Stem a word contained in a portion of a char[] array.  Returns
        /// true if the stemming process resulted in a word different from
        /// the input.  You can retrieve the result with
        /// getResultLength()/getResultBuffer() or toString().
        /// </summary>
        public virtual bool Stem(char[] wordBuffer, int offset, int wordLen)
        {
            Reset();
            if (b.Length < wordLen)
            {
                char[] new_b = new char[wordLen + EXTRA];
                b = new_b;
            }
            Array.Copy(wordBuffer, offset, b, 0, wordLen);
            i = wordLen;
            return Stem(0);
        }
		
        /// <summary>Stem a word contained in a leading portion of a char[] array.
        /// Returns true if the stemming process resulted in a word different
        /// from the input.  You can retrieve the result with
        /// getResultLength()/getResultBuffer() or toString().
        /// </summary>
        public virtual bool Stem(char[] word, int wordLen)
        {
            return Stem(word, 0, wordLen);
        }
		
        /// <summary>Stem the word placed into the Stemmer buffer through calls to add().
        /// Returns true if the stemming process resulted in a word different
        /// from the input.  You can retrieve the result with
        /// getResultLength()/getResultBuffer() or toString().
        /// </summary>
        public virtual bool Stem()
        {
            return Stem(0);
        }
		
        public virtual bool Stem(int i0)
        {
            k = i - 1;
            k0 = i0;
            if (k > k0 + 1)
            {
                Step1(); Step2(); Step3(); Step4(); Step5(); Step6();
            }
            // Also, a word is considered dirty if we lopped off letters
            // Thanks to Ifigenia Vairelles for pointing this out.
            if (i != k + 1)
                dirty = true;
            i = k + 1;
            return dirty;
        }
		
        /// <summary>Test program for demonstrating the Stemmer.  It reads a file and
        /// stems each word, writing the result to standard out.
        /// Usage: Stemmer file-name
        /// </summary>
        [STAThread]
        public static void  Main(System.String[] args)
        {
            PorterStemmer s = new PorterStemmer();
			
            for (int i = 0; i < args.Length; i++)
            {
                try
                {
                    System.IO.BinaryReader in_Renamed = new System.IO.BinaryReader(System.IO.File.Open(args[i], System.IO.FileMode.Open, System.IO.FileAccess.Read));
                    byte[] buffer = new byte[1024];
                    int bufferLen, offset, ch;
					
                    bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
                    offset = 0;
                    s.Reset();
					
                    while (true)
                    {
                        if (offset < bufferLen)
                            ch = buffer[offset++];
                        else
                        {
                            bufferLen = in_Renamed.Read(buffer, 0, buffer.Length);
                            offset = 0;
                            if (bufferLen <= 0)
                                ch = - 1;
                            else
                                ch = buffer[offset++];
                        }
						
                        if (System.Char.IsLetter((char) ch))
                        {
                            s.Add(System.Char.ToLower((char) ch));
                        }
                        else
                        {
                            s.Stem();
                            System.Console.Out.Write(s.ToString());
                            s.Reset();
                            if (ch < 0)
                                break;
                            else
                            {
                                System.Console.Out.Write((char) ch);
                            }
                        }
                    }
					
                    in_Renamed.Close();
                }
                catch (System.IO.IOException)
                {
                    System.Console.Out.WriteLine("error reading " + args[i]);
                }
            }
        }
    }
}